Set up

set.seed(2021)

library(MouseGastrulationData)
library(ggplot2)
library(reshape)
library(org.Mm.eg.db)
library(scater)
library(patchwork)

Load functions

source("../scripts/StabMap_functions.R")

Load E6.5, E7.5 and E8.5 data using MouseGastrulationData

mt = MouseGastrulationData::AtlasSampleMetadata
samples = mt$sample[mt$stage %in% c("E6.5", "E7.5", "E8.5")]
SCE <- EmbryoAtlasData(samples = samples)
SCE
## class: SingleCellExperiment 
## dim: 29452 37551 
## metadata(0):
## assays(1): counts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(37551): cell_1 cell_2 ... cell_139330 cell_139331
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
SCE <- logNormCounts(SCE)
celltype_colours = MouseGastrulationData::EmbryoCelltypeColours
celltype_colours
##                       Epiblast               Primitive Streak 
##                      "#635547"                      "#DABE99" 
##                Caudal epiblast                            PGC 
##                      "#9E6762"                      "#FACB12" 
##      Anterior Primitive Streak                      Notochord 
##                      "#C19F70"                      "#0F4A9C" 
##                  Def. endoderm                            Gut 
##                      "#F397C0"                      "#EF5A9D" 
##               Nascent mesoderm                 Mixed mesoderm 
##                      "#C594BF"                      "#DFCDE4" 
##          Intermediate mesoderm                Caudal Mesoderm 
##                      "#139992"                      "#3F84AA" 
##              Paraxial mesoderm               Somitic mesoderm 
##                      "#8DB5CE"                      "#005579" 
##            Pharyngeal mesoderm                 Cardiomyocytes 
##                      "#C9EBFB"                      "#B51D8D" 
##                      Allantois                   ExE mesoderm 
##                      "#532C8A"                      "#8870AD" 
##                     Mesenchyme Haematoendothelial progenitors 
##                      "#CC7818"                      "#FBBE92" 
##                    Endothelium            Blood progenitors 1 
##                      "#FF891C"                      "#F9DECF" 
##            Blood progenitors 2                     Erythroid1 
##                      "#C9A997"                      "#C72228" 
##                     Erythroid2                     Erythroid3 
##                      "#F79083"                      "#EF4E22" 
##                            NMP           Rostral neurectoderm 
##                      "#8EC792"                      "#65A83E" 
##            Caudal neurectoderm                   Neural crest 
##                      "#354E23"                      "#C3C388" 
##   Forebrain/Midbrain/Hindbrain                    Spinal cord 
##                      "#647A4F"                      "#CDE088" 
##               Surface ectoderm              Visceral endoderm 
##                      "#F7F79E"                      "#F6BFCB" 
##                   ExE endoderm                   ExE ectoderm 
##                      "#7F6874"                      "#989898" 
##              Parietal endoderm 
##                      "#1A1A1A"

Randomly subset 5000 cells from this data, for speed reasons

cells_thin = sample(colnames(SCE), 5000)
SCE_thin = SCE[,cells_thin]
SCE_thin
## class: SingleCellExperiment 
## dim: 29452 5000 
## metadata(0):
## assays(2): counts logcounts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(5000): cell_99688 cell_48887 ... cell_47358 cell_42653
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):

Get gene names for seqFISH data

Account for gene name difference for Cavin3/Prkcdbp. Then convert to the ENSEMBL gene IDs.

seqFISH_gene_symbols = c(scan(what = "character", file = "../data/seqFISH_genes.txt"), "Prkcdbp")
table(seqFISH_gene_symbols %in% rowData(SCE_thin)[,2]) # only one missing, which is accounted for
## 
## FALSE  TRUE 
##     1   351
seqFISH_genes = rownames(SCE_thin)[rowData(SCE_thin)[,2] %in% seqFISH_gene_symbols]
length(seqFISH_genes)
## [1] 351

Generate similarity for all genes

This will be re-used multiple times. Note this is a dense matrix, as output from igraph::similarity.

full_sim = generateSimilarity(SCE_thin, batchFactor = SCE_thin$sample)
dim(full_sim)
## [1] 5000 5000
full_sim[1:5,1:5]
##             cell_99688 cell_48887 cell_138969 cell_47896 cell_137543
## cell_99688           1          0           0          0           0
## cell_48887           0          1           0          0           0
## cell_138969          0          0           1          0           0
## cell_47896           0          0           0          1           0
## cell_137543          0          0           0          0           1

Calculate uncertainty scores for seqFISH genes

plotAdditional = list("celltype", 
                      list(scale_colour_manual(values = celltype_colours),
                           theme(legend.position = "bottom")))
scores_seqFISH = getSubsetUncertainty(SCE_thin,
                                      subsetGenes = seqFISH_genes,
                                      full_sim = full_sim,
                                      plot = TRUE,
                                      plotAdditional = plotAdditional)

Some exploration of these scores

scores_df = data.frame(cell = colnames(SCE_thin),
                       score = scores_seqFISH,
                       celltype = SCE_thin$celltype)
ggplot(scores_df, aes(x = celltype, y = score)) + 
  geom_boxplot(aes(fill = celltype)) +
  theme_classic() +
  scale_fill_manual(values = celltype_colours) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  theme(legend.position = "bottom") +
  NULL

Estimate scores for the entire dataset

I can also estimate the uncertainty for the full data, given the subset. This means that I can estimate the uncertainty for all cells, but only pull from the 5000x5000 similarity distribution from the random subset of cells. This is of course an estimate, but it’s worthwhile knowing one can circumvent needing to calculate the similarity across all cells.

joint_sample = SCE$sample
names(joint_sample) <- colnames(SCE)
scores_all = getSubsetUncertainty(SCE_thin,
                                  querySCE = SCE[, setdiff(colnames(SCE), colnames(SCE_thin))],
                                  subsetGenes = seqFISH_genes,
                                  full_sim = full_sim,
                                  plot = TRUE,
                                  plotAdditional = plotAdditional,
                                  jointBatchFactor = joint_sample)

length(scores_all)
## [1] 37551
head(scores_all)
##  cell_99688  cell_48887 cell_138969  cell_47896 cell_137543  cell_49005 
##  0.34612245  0.10040816  0.32163265  0.06285714  0.38857143  0.16816327
scores_all_df = data.frame(cell = names(scores_all),
                       score = scores_all,
                       celltype = SCE[,names(scores_all)]$celltype,
                       sample = SCE[,names(scores_all)]$sample,
                       score_sub = scores_seqFISH[names(scores_all)])

dim(scores_all_df)
## [1] 37551     5
head(scores_all_df)
##                    cell      score                     celltype sample
## cell_99688   cell_99688 0.34612245 Forebrain/Midbrain/Hindbrain     29
## cell_48887   cell_48887 0.10040816          Blood progenitors 1     19
## cell_138969 cell_138969 0.32163265          Pharyngeal mesoderm     37
## cell_47896   cell_47896 0.06285714                         <NA>     19
## cell_137543 cell_137543 0.38857143 Forebrain/Midbrain/Hindbrain     37
## cell_49005   cell_49005 0.16816327             Nascent mesoderm     19
##             score_sub
## cell_99688  0.5240816
## cell_48887  0.1779592
## cell_138969 0.5722449
## cell_47896  0.4008163
## cell_137543 0.6424490
## cell_49005  0.5093878
ggplot(scores_all_df, aes(x = score_sub, y = score)) + 
  geom_point(aes(colour = celltype)) + 
  theme_classic() +
  scale_colour_manual(values = celltype_colours) +
  geom_abline(slope = 1, intercept = 0) +
  theme(legend.position = "bottom") +
  NULL

ggplot(scores_all_df, aes(x = interaction(sample, celltype), y = score)) + 
  geom_boxplot(aes(fill = celltype)) + 
  theme_classic() +
  scale_fill_manual(values = celltype_colours) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  theme(legend.position = "bottom") +
  NULL

Certain cell types do appear to show a higher level of uncertainty, e.g.  Allantois and Epiblast.

Which genes mark differences between cells with high ambiguity?

Here I perform a clustering of the data using the seqFISH genes, extract the clusters with the highest uncertainty values, then re-cluster these cells using the entire transcriptome. Presumably, the markers for these re-clusters represent the gene expression differences that are as yet unaccounted for within the seqFISH gene set.

SCE_sub <- logNormCounts(SCE[seqFISH_genes,])
fit = modelGeneVar(logcounts(SCE_sub))
HVGs = getTopHVGs(fit)
length(HVGs)
## [1] 181
SCE_sub <- runPCA(SCE_sub, subset_row = HVGs)
SCE_sub_corrected <- fastMNN(SCE_sub, batch = SCE_sub$sample)
PCs_sub = reducedDim(SCE_sub_corrected, "corrected")
reducedDim(SCE_sub, "UMAP") <- calculateUMAP(t(PCs_sub))

cl = clusterSNNGraph(SCE_sub_corrected, use.dimred = "corrected")
table(cl)
## cl
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2998  194 2221 3065 1549 4432 2885 3120 1195 5258 4583  730  909 1777 1138  149 
##   17   18   19   20   21   22   23   24 
##  142   32   13   30  640  360   88   43
SCE_sub$cluster_seqFISH = paste0("seqFISH_cluster_", cl)
SCE_sub$score = scores_all_df[colnames(SCE_sub), "score"]

plotUMAP(SCE_sub, colour_by = "cluster_seqFISH") + scale_colour_discrete() + ggtitle("Clusters using seqFISH genes")

plotUMAP(SCE_sub, colour_by = "score")

plotUMAP(SCE_sub, colour_by = "celltype") + scale_colour_manual(values = celltype_colours)

scores_all_df$cluster_seqFISH = SCE_sub[,names(scores_all)]$cluster_seqFISH

ggplot(scores_all_df, aes(x = cluster_seqFISH, y = score)) + 
  geom_boxplot(aes(fill = cluster_seqFISH)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  NULL

There are clusters that have a high uncertainty value, while others remain fairly low. This means that for these clusters, there may be additional genes that are needed to better estimate the cell-cell neighbourhoods.

Re-cluster the cells within these high uncertainty clusters using all genes, and extract their marker genes. Visualise these on the full UMAP and examine what genes they are.

clusters_ordered_uncertainty = names(sort(tapply(SCE_sub$score, SCE_sub$cluster_seqFISH, median), decreasing = TRUE))
clusters_ordered_uncertainty
##  [1] "seqFISH_cluster_21" "seqFISH_cluster_15" "seqFISH_cluster_10"
##  [4] "seqFISH_cluster_11" "seqFISH_cluster_1"  "seqFISH_cluster_8" 
##  [7] "seqFISH_cluster_4"  "seqFISH_cluster_6"  "seqFISH_cluster_14"
## [10] "seqFISH_cluster_2"  "seqFISH_cluster_20" "seqFISH_cluster_22"
## [13] "seqFISH_cluster_3"  "seqFISH_cluster_18" "seqFISH_cluster_24"
## [16] "seqFISH_cluster_17" "seqFISH_cluster_23" "seqFISH_cluster_5" 
## [19] "seqFISH_cluster_13" "seqFISH_cluster_19" "seqFISH_cluster_7" 
## [22] "seqFISH_cluster_12" "seqFISH_cluster_9"  "seqFISH_cluster_16"
minFDRs_list = list()

for (i in 1:5) {
  print(i)
  
  SCE_sub$cluster_seqFISH_tmp = SCE_sub$cluster_seqFISH == clusters_ordered_uncertainty[i]
  g = plotUMAP(SCE_sub, colour_by = "cluster_seqFISH_tmp") +
    scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey"), na.value = "grey") +
    ggtitle(paste("Clusters using seqFISH genes on seqFISH genes UMAP, ",
                  clusters_ordered_uncertainty[i]))
  print(g)
  
  g = plotReducedDim(SCE_sub, dimred = "umap", colour_by = "cluster_seqFISH_tmp") +
    scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey"), na.value = "grey") +
    ggtitle(paste("Clusters using seqFISH genes on full data UMAP, ",
                  clusters_ordered_uncertainty[i])) +
    coord_fixed()
  print(g)
  
  table(SCE_sub$celltype, SCE_sub$cluster_seqFISH_tmp)
  
  # cluster within these cells
  SCE_full = SCE[,colnames(SCE_sub)[SCE_sub$cluster_seqFISH_tmp]]
  SCE_full <- logNormCounts(SCE_full)
  fit = modelGeneVar(logcounts(SCE_full), block = SCE_full$sample)
  HVGs = getTopHVGs(fit)
  length(HVGs)
  SCE_full <- runPCA(SCE_full, subset_row = HVGs)
  SCE_full_corrected <- fastMNN(SCE_full, batch = SCE_full$sample)
  SCE_full_clusters = clusterSNNGraph(SCE_full_corrected, use.dimred = "corrected")
  table(SCE_full_clusters)
  SCE_full$full_cluster = SCE_full_clusters
  
  full_cluster_markers_array = getMarkerArray(SCE_full, "full_cluster")
  minFDRs = rowMins(full_cluster_markers_array[,,"FDR"], na.rm = TRUE)
  names(minFDRs) <- dimnames(full_cluster_markers_array)[[1]]
  
  minFDRs_list[[i]] <- minFDRs
  
  sort(minFDRs)[1:5]
  table(minFDRs < 0.01)
  head(minFDRs[minFDRs < 0.01])
  rowData(SCE_full)$SYMBOL[order(minFDRs)[1:5]]
  
  gList = sapply(rowData(SCE_full)$SYMBOL[order(minFDRs)[1:9]], function(gene) {
    plotReducedDim(SCE, dimred = "umap", 
                   # colour_by = rowData(SCE_full)$SYMBOL[order(minFDRs)[1]],
                   colour_by = gene,
                   swap_rownames = "SYMBOL",
                   by_exprs_values = "logcounts") +
      coord_fixed()
  }, simplify = FALSE)
  g = wrap_plots(gList, nrow = 3, ncol = 3)
  print(g)
}
## [1] 1

## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"

## [1] 2

## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"

## [1] 3

## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"

## [1] 4

## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"
## [1] "10"
## [1] "11"
## [1] "12"
## [1] "13"
## [1] "14"

## [1] 5

## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"
## [1] "10"
## [1] "11"
## [1] "12"

Overall it appears that cell cycle genes are lacking, as well as some relevant genes for NMPs, e.g. Hes1, Nkx1-2. A decent number of these genes are also in the smFISH set, so it’s worth seeing what difference these genes make and to which cells in terms of their neighbourhood similarities.

sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-apple-darwin19.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /usr/local/Cellar/openblas/0.3.13/lib/libopenblasp-r0.3.13.dylib
## LAPACK: /usr/local/Cellar/r/4.0.4/lib/R/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] BiocNeighbors_1.8.2         batchelor_1.6.2            
##  [3] igraph_1.2.6                bluster_1.0.0              
##  [5] scran_1.18.3                patchwork_1.1.1            
##  [7] scater_1.18.3               org.Mm.eg.db_3.12.0        
##  [9] AnnotationDbi_1.52.0        reshape_0.8.8              
## [11] ggplot2_3.3.3               MouseGastrulationData_1.4.0
## [13] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [15] Biobase_2.50.0              GenomicRanges_1.42.0       
## [17] GenomeInfoDb_1.26.2         IRanges_2.24.1             
## [19] S4Vectors_0.28.1            BiocGenerics_0.36.0        
## [21] MatrixGenerics_1.2.0        matrixStats_0.57.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6                  bit64_4.0.5                  
##  [3] RcppAnnoy_0.0.18              httr_1.4.2                   
##  [5] tools_4.0.4                   ResidualMatrix_1.0.0         
##  [7] R6_2.5.0                      irlba_2.3.3                  
##  [9] vipor_0.4.5                   uwot_0.1.10                  
## [11] DBI_1.1.1                     colorspace_2.0-0             
## [13] withr_2.4.1                   tidyselect_1.1.0             
## [15] gridExtra_2.3                 bit_4.0.4                    
## [17] curl_4.3                      compiler_4.0.4               
## [19] DelayedArray_0.16.1           labeling_0.4.2               
## [21] scales_1.1.1                  rappdirs_0.3.3               
## [23] stringr_1.4.0                 digest_0.6.27                
## [25] rmarkdown_2.6                 XVector_0.30.0               
## [27] pkgconfig_2.0.3               htmltools_0.5.1.1            
## [29] sparseMatrixStats_1.2.0       limma_3.46.0                 
## [31] dbplyr_2.1.0                  fastmap_1.1.0                
## [33] rlang_0.4.10                  RSQLite_2.2.3                
## [35] shiny_1.6.0                   DelayedMatrixStats_1.12.2    
## [37] farver_2.0.3                  generics_0.1.0               
## [39] BiocParallel_1.24.1           dplyr_1.0.3                  
## [41] RCurl_1.98-1.2                magrittr_2.0.1               
## [43] BiocSingular_1.6.0            GenomeInfoDbData_1.2.4       
## [45] scuttle_1.0.4                 Matrix_1.3-2                 
## [47] Rcpp_1.0.6                    ggbeeswarm_0.6.0             
## [49] munsell_0.5.0                 viridis_0.5.1                
## [51] lifecycle_0.2.0               edgeR_3.32.1                 
## [53] stringi_1.5.3                 yaml_2.2.1                   
## [55] zlibbioc_1.36.0               plyr_1.8.6                   
## [57] BiocFileCache_1.14.0          AnnotationHub_2.22.0         
## [59] grid_4.0.4                    blob_1.2.1                   
## [61] dqrng_0.2.1                   promises_1.1.1               
## [63] ExperimentHub_1.16.0          crayon_1.3.4                 
## [65] lattice_0.20-41               cowplot_1.1.1                
## [67] beachmat_2.6.4                locfit_1.5-9.4               
## [69] knitr_1.30                    pillar_1.4.7                 
## [71] codetools_0.2-18              glue_1.4.2                   
## [73] BiocVersion_3.12.0            evaluate_0.14                
## [75] BiocManager_1.30.10           vctrs_0.3.6                  
## [77] httpuv_1.5.5                  gtable_0.3.0                 
## [79] purrr_0.3.4                   assertthat_0.2.1             
## [81] cachem_1.0.1                  xfun_0.20                    
## [83] rsvd_1.0.3                    mime_0.9                     
## [85] xtable_1.8-4                  RSpectra_0.16-0              
## [87] later_1.1.0.1                 viridisLite_0.3.0            
## [89] tibble_3.0.5                  beeswarm_0.2.3               
## [91] memoise_2.0.0                 statmod_1.4.35               
## [93] ellipsis_0.3.1                interactiveDisplayBase_1.28.0